From b18f38d8129360973c360db8a53b48c56dc73408 Mon Sep 17 00:00:00 2001
From: =?utf8?q?=C3=98yvind=20Kol=C3=A5s?= <pippin@gimp.org>
Date: Thu, 7 Sep 2017 00:29:59 +0200
Subject: [PATCH] extensions: make float-half extension use exact LUTs

Both for conversion from half to float, with a table initialized on load, and
for float to half, with new tables and a faster approach from qcms / mozilla / webkit.
---
 extensions/float-half.c | 161 +++++++++++++++++++++++++++-------------
 1 file changed, 109 insertions(+), 52 deletions(-)

diff --git a/extensions/float-half.c b/extensions/float-half.c
index 08b7dfb..b471a02 100644
--- a/extensions/float-half.c
+++ b/extensions/float-half.c
@@ -75,7 +75,7 @@
 #include "babl.h"
 #include "extensions/util.h"
 
-static void halfp2singles(void *target, const void *source, long numel)
+static void halfp2singles_fun(void *target, const void *source, long numel)
 {
     uint16_t *hp = (uint16_t *) source; // Type pun input as an unsigned 16-bit int
     uint32_t *xp = (uint32_t *) target; // Type pun output as an unsigned 32-bit int
@@ -122,59 +122,107 @@ static void halfp2singles(void *target, const void *source, long numel)
     }
 }
 
+static float half_float_table[65536]; /* float value for every 16-bit half pattern; filled by the loop in init() */
+
+static void halfp2singles(void *target, const void *source, long numel)
+{ /* Expand numel half floats read from source into floats in target via half_float_table. */
+  const uint16_t *src = source; /* keep the const qualifier of source */
+  float *dst = target;
+  long i; /* same type as numel, so the index cannot overflow before reaching the bound */
+  for (i = 0; i < numel; i++)
+  {
+    dst[i] = half_float_table[src[i]];
+  }
+}
+
+/* Table-based float -> half conversion, adapted from qcms/blink/webkit. */
+
+const unsigned short half_float_base_table[512] = { /* indexed by bits 31..23 of the float (see float_to_half_float); gives the half pattern the shifted mantissa is added to */
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+0,0,0,0,0,0,0,1,2,4,8,16,32,64,128,256,
+512,1024,2048,3072,4096,5120,6144,7168,8192,9216,10240,11264,12288,13312,14336,15360,
+16384,17408,18432,19456,20480,21504,22528,23552,24576,25600,26624,27648,28672,29696,30720,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,31744,
+32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,
+32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,
+32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,
+32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,
+32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,
+32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,32768,
+32768,32768,32768,32768,32768,32768,32768,32769,32770,32772,32776,32784,32800,32832,32896,33024,
+33280,33792,34816,35840,36864,37888,38912,39936,40960,41984,43008,44032,45056,46080,47104,48128,
+49152,50176,51200,52224,53248,54272,55296,56320,57344,58368,59392,60416,61440,62464,63488,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,
+64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512,64512
+};
+
+const unsigned char half_float_shift_table[512] = { /* indexed like half_float_base_table; right-shift applied to the 23-bit float mantissa before it is added to the base */
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,23,22,21,20,19,18,17,16,15,
+14,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,13,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,23,22,21,20,19,18,17,16,15,
+14,13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,
+13,13,13,13,13,13,13,13,13,13,13,13,13,13,13,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,
+24,24,24,24,24,24,24,24,24,24,24,24,24,24,24,13
+};
+
+static inline unsigned short float_to_half_float(float f)
+{
+  // Returns the 16-bit half-float pattern for f using two table lookups. See Blink::Source/platform/graphics/gpu/WebGLImageConversion.cpp::convertFloatToHalfFloat() and http://crbug.com/491784
+  union {
+    float    f;
+    uint32_t u;
+  } u = {f}; // view the float's storage as a 32-bit unsigned integer
+  unsigned temp = u.u;
+  unsigned signexp = (temp >> 23) & 0x1ff; // bits 31..23 of the float select the entry in both tables
+  return half_float_base_table[signexp] + ((temp & 0x007fffff) >> half_float_shift_table[signexp]); // base pattern plus the low 23 bits shifted right; shifted-out bits are discarded (no rounding)
+}
+
 static void singles2halfp(void *target, const void *source, long numel)
 {
-    uint16_t *hp = (uint16_t *) target; // Type pun output as an unsigned 16-bit int
-    uint32_t *xp = (uint32_t *) source; // Type pun input as an unsigned 32-bit int
-    uint16_t    hs, he, hm;
-    uint32_t x, xs, xe, xm;
-    int hes;
-    
-    if( source == NULL || target == NULL ) { // Nothing to convert (e.g., imag part of pure real)
-        return;
-    }
-    while( numel-- ) {
-        x = *xp++;
-        if( (x & 0x7FFFFFFFu) == 0 ) {  // Signed zero
-            *hp++ = (uint16_t) (x >> 16);  // Return the signed zero
-        } else { // Not zero
-            xs = x & 0x80000000u;  // Pick off sign bit
-            xe = x & 0x7F800000u;  // Pick off exponent bits
-            xm = x & 0x007FFFFFu;  // Pick off mantissa bits
-            if( xe == 0 ) {  // Denormal will underflow, return a signed zero
-                *hp++ = (uint16_t) (xs >> 16);
-            } else if( xe == 0x7F800000u ) {  // Inf or NaN (all the exponent bits are set)
-                if( xm == 0 ) { // If mantissa is zero ...
-                    *hp++ = (uint16_t) ((xs >> 16) | 0x7C00u); // Signed Inf
-                } else {
-                    *hp++ = (uint16_t) 0xFE00u; // NaN, only 1st mantissa bit set
-                }
-            } else { // Normalized number
-                hs = (uint16_t) (xs >> 16); // Sign bit
-                hes = ((int)(xe >> 23)) - 127 + 15; // Exponent unbias the single, then bias the halfp
-                if( hes >= 0x1F ) {  // Overflow
-                    *hp++ = (uint16_t) ((xs >> 16) | 0x7C00u); // Signed Inf
-                } else if( hes <= 0 ) {  // Underflow
-                    if( (14 - hes) > 24 ) {  // Mantissa shifted all the way off & no rounding possibility
-                        hm = (uint16_t) 0u;  // Set mantissa to zero
-                    } else {
-                        xm |= 0x00800000u;  // Add the hidden leading bit
-                        hm = (uint16_t) (xm >> (14 - hes)); // Mantissa
-                        if( (xm >> (13 - hes)) & 0x00000001u ) // Check for rounding
-                            hm += (uint16_t) 1u; // Round, might overflow into exp bit, but this is OK
-                    }
-                    *hp++ = (hs | hm); // Combine sign bit and mantissa bits, biased exponent is zero
-                } else {
-                   he = (uint16_t) (hes << 10); // Exponent
-                    hm = (uint16_t) (xm >> 13); // Mantissa
-                    if( xm & 0x00001000u ) // Check for rounding
-                        *hp++ = (hs | he | hm) + (uint16_t) 1u; // Round, might overflow to inf, this is OK
-                    else
-                        *hp++ = (hs | he | hm);  // No rounding
-                }
-            }
-        }
-    }
+  const float *src = source; /* converts numel floats from source into 16-bit half floats in target */
+  uint16_t    *dst = target; /* one 16-bit slot per half float; a uint8_t pointer would keep only the low byte of each result */
+  long i; /* same type as numel, so the index cannot overflow before reaching the bound */
+  for (i = 0; i < numel; i++)
+    dst[i] = float_to_half_float (src[i]);
 }
 
 static inline long
@@ -232,6 +280,7 @@ int init (void);
 int
 init (void)
 {
+  int i;
   const Babl *rgbaF_linear = babl_format_new (
     babl_model ("RGBA"),
     babl_type ("float"),
@@ -337,6 +386,14 @@ init (void)
     babl_component ("Y'"),
     NULL);
 
+  for (i = 0; i < 65536; i++)
+  {
+    uint16_t buf[2] = {i, i};
+    float   fbuf[2];
+    halfp2singles_fun(fbuf, buf, 1);
+    half_float_table[i] = fbuf[0];
+  }
+
 #define CONV(src, dst) \
 { \
   babl_conversion_new (src ## _linear, dst ## _linear, "linear", conv_ ## src ## _ ## dst, NULL); \
-- 
2.30.2